{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# NYC Airbnb Price Prediction - Data Preparation\n",
    "\n",
    "Use dataset published by Kaggle - https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data - to train a simple deep learning model to predict prices for Airbnb properties.\n",
    "\n",
    "\n",
    "This notebook contains the common data loading and preparation steps:\n",
    "- load data from the input CSV\n",
    "- fix missing values\n",
    "- clean up anomalies"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Common imports and variables\n",
    "Imports and variable definitions that are common to the entire notebook\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: requests in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (2.22.0)"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are using pip version 19.0.3, however version 20.2b1 is available.\n",
      "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (2.8)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (2019.6.16)\n",
      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (1.25.3)\n",
      "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (3.0.4)\n",
      "Requirement already satisfied: xlrd in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (1.2.0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are using pip version 19.0.3, however version 20.2b1 is available.\n",
      "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
     ]
    }
   ],
   "source": [
    "!pip install requests\n",
    "!pip install xlrd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "import matplotlib.pyplot as plt\n",
    "import datetime as dt\n",
    "# common imports\n",
    "import zipfile\n",
    "import time\n",
    "# import datetime, timedelta\n",
    "import datetime\n",
    "from datetime import datetime, timedelta\n",
    "from datetime import date\n",
    "from dateutil import relativedelta\n",
    "from io import StringIO\n",
    "import pandas as pd\n",
    "import pickle\n",
    "from sklearn.base import BaseEstimator\n",
    "from sklearn.base import TransformerMixin\n",
    "from io import StringIO\n",
    "import requests\n",
    "import json\n",
    "from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline \n",
    "import os\n",
    "import math\n",
    "from subprocess import check_output\n",
    "from IPython.display import display\n",
    "import logging\n",
    "import yaml\n",
    "from collections import Counter\n",
    "import re\n",
    "import os\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "current directory is: C:\\personal\\manning\\second_example_june_2020\\airbnb\\dl_example_2\\notebooks\n",
      "path_to_yaml C:\\personal\\manning\\second_example_june_2020\\airbnb\\dl_example_2\\notebooks\\airbnb_data_preparation_config.yml\n"
     ]
    }
   ],
   "source": [
    "# load config file\n",
    "current_path = os.getcwd()\n",
    "print(\"current directory is: \"+current_path)\n",
    "\n",
    "path_to_yaml = os.path.join(current_path, 'airbnb_data_preparation_config.yml')\n",
    "print(\"path_to_yaml \"+path_to_yaml)\n",
    "try:\n",
    "    with open (path_to_yaml, 'r') as c_file:\n",
    "        config = yaml.safe_load(c_file)\n",
    "except Exception as e:\n",
    "    print('Error reading the config file')\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [],
   "source": [
    "# common variables\n",
    "# control whether to load data from scratch from original source or from saved dataframe\n",
    "load_from_scratch = config['general']['load_from_scratch']\n",
    "# control whether to save dataframe with transformed data\n",
    "save_transformed_dataframe = config['general']['save_transformed_dataframe']\n",
    "# control whether rows containing erroneous values are removed from the saved dataset\n",
    "remove_bad_values = config['general']['remove_bad_values']\n",
    "# load default replacements for missing values\n",
    "text_default = config['general']['text_default']\n",
    "categorical_default = config['general']['categorical_default']\n",
    "time_default = config['general']['time_default']\n",
    "continuous_default = config['general']['continuous_default']\n",
    "# original CSV version of input (unprocessed) dataset\n",
    "input_csv = config['file_names']['input_csv']\n",
    "# saved pickled version of input dataset\n",
    "pickled_input_dataframe = config['file_names']['pickled_input_dataframe']\n",
    "# name of file to which prepared data set is saved as a pickled dataframe\n",
    "pickled_output_dataframe = config['file_names']['pickled_output_dataframe']\n",
    "# load lists of column categories\n",
    "collist = config['categorical']\n",
    "textcols = config['text']\n",
    "continuouscols = config['continuous']\n",
    "excludefromcolist = config['excluded']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "load_from_scratch True\n",
      "save_transformed_dataframe True\n",
      "remove_bad_values True\n",
      "pickled_input_dataframe AB_NYC_2019_df.pkl\n",
      "pickled_output_dataframe AB_NYC_2019_remove_bad_values_jun21_2020.pkl\n",
      "defaults for text categorical time continuous are missing, missing, 2019-01-01, 0.0\n",
      "collist is:  ['neighbourhood_group', 'neighbourhood', 'room_type']\n",
      "textcols is:  ['name', 'host_name']\n",
      "continuouscols is:  ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count']\n",
      "excludefromcolist is:  ['price', 'id', 'latitude', 'longitude', 'name', 'host_name', 'last_review']\n"
     ]
    }
   ],
   "source": [
    "print(\"load_from_scratch \"+str(load_from_scratch))\n",
    "print(\"save_transformed_dataframe \"+str(save_transformed_dataframe))\n",
    "print(\"remove_bad_values \"+str(remove_bad_values))\n",
    "print(\"pickled_input_dataframe \"+str(pickled_input_dataframe))\n",
    "print(\"pickled_output_dataframe \"+str(pickled_output_dataframe))\n",
    "print(\"defaults for text categorical time continuous are \"+text_default+\", \"+categorical_default+\", \"+str(time_default)+\", \"+str(continuous_default))\n",
    "print(\"collist is: \",str(collist))\n",
    "print(\"textcols is: \",str(textcols))\n",
    "print(\"continuouscols is: \",str(continuouscols))\n",
    "print(\"excludefromcolist is: \",str(excludefromcolist))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load Data\n",
    "- ingest CVS into a Pandas dataframe "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get the directory for that this notebook is in and return the directory containing data files\n",
    "\n",
    "def get_path():\n",
    "    rawpath = os.getcwd()\n",
    "    # data is in a directory called \"data\" that is a sibling to the directory containing the notebook\n",
    "    path = os.path.abspath(os.path.join(rawpath, '..', 'data'))\n",
    "    return(path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# given a path return the list of xls files in the directory\n",
    "def get_xls_list(path):\n",
    "    files = os.listdir(path)\n",
    "    files_xls = [f for f in files if f[-4:] == 'xlsx']\n",
    "    print(files)\n",
    "    print(files_xls)\n",
    "    return(files_xls)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define categories for input columns\n",
    "def define_feature_categories(df):\n",
    "    allcols = list(df)\n",
    "    print(\"all cols\",allcols)\n",
    "    textcols = ['name','host_name'] # \n",
    "    continuouscols = ['price','minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count','availability_365'] \n",
    "                      # columns to deal with as continuous values - no embeddings\n",
    "    timecols = ['last_review']\n",
    "    collist = ['neighbourhood_group','neighbourhood','room_type']\n",
    "    for col in continuouscols:\n",
    "        df[col] = df[col].astype(float)\n",
    "    print('texcols: ',textcols)\n",
    "    print('continuouscols: ',continuouscols)\n",
    "    print('timecols: ',timecols)\n",
    "    print('collist: ',collist)\n",
    "    return(allcols,textcols,continuouscols,timecols,collist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fill missing values according to the column category\n",
    "def fill_missing(dataset,allcols,textcols,continuouscols,timecols,collist):\n",
    "    logging.debug(\"before mv\")\n",
    "    for col in collist:\n",
    "        dataset[col].fillna(value=categorical_default, inplace=True)\n",
    "    for col in continuouscols:\n",
    "        dataset[col].fillna(value=continuous_default,inplace=True)\n",
    "    for col in timecols:\n",
    "        dataset[col].fillna(value=time_default,inplace=True)\n",
    "    for col in textcols:\n",
    "        dataset[col].fillna(value=text_default, inplace=True)\n",
    "    return (dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load dataframe\n",
    "- load pickled dataframe\n",
    "- show info about the dataset\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read in data, either from original CSV file in data directory or from saved pickled dataframe\n",
    "def ingest_data(path):\n",
    "    if load_from_scratch:\n",
    "        unpickled_df = pd.read_csv(os.path.join(path,input_csv)) \n",
    "    else:\n",
    "        unpickled_df = pd.read_pickle(os.path.join(path,pickled_input_dataframe))\n",
    "        logging.debug(\"reloader done\")\n",
    "    return(unpickled_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# General cleanup\n",
    "- correct types for Route and Vehicle\n",
    "- fill missing values\n",
    "- create report-date-time index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the dataset incorporated some anomalies in the 2019 data, including:\n",
    "# extraneous Incident ID in April 2019 tab\n",
    "# Gap and Delay columns in April and June 2019 tabs for what had otherwise been called Min Gap and Min Delay\n",
    "# this function cleans up these anomalies\n",
    "def fix_anomalous_columns(df):\n",
    "    # for rows where there is NaN in the Min Delay or Min Gap columns, copy over value from Delay or Gap\n",
    "    # df.Temp_Rating.fillna(df.Farheit, inplace=True)\n",
    "    df['Min Delay'].fillna(df['Delay'], inplace=True)\n",
    "    df['Min Gap'].fillna(df['Gap'], inplace=True)\n",
    "    # now that the useful values have been copied from Delay and Gap, remove them\n",
    "    del df['Delay']\n",
    "    del df['Gap']\n",
    "    # remove Incident ID column - it's extraneous\n",
    "    del df['Incident ID']\n",
    "    return(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def replace_time(date_time_value,time_value):\n",
    "    ''' given a datetime replace the time portion '''\n",
    "     \n",
    "    date_time_value = date_time_value.replace(hour=time_value.hour,minute=time_value.minute,second=time_value.minute)\n",
    "    return(date_time_value)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def general_cleanup(df):\n",
    "    # ensure Route and Vehicle are strings, not numeric\n",
    "    df['Route'] = df['Route'].astype(str)\n",
    "    df['Vehicle'] = df['Vehicle'].astype(str)\n",
    "    # remove extraneous characters left from Vehicle values being floats\n",
    "    df['Vehicle'] = df['Vehicle'].str[:-2]\n",
    "    # tactical definition of categories\n",
    "    allcols,textcols,continuouscols,timecols,collist = define_feature_categories(df)\n",
    "    # fill in missing values\n",
    "    df.isnull().sum(axis = 0)\n",
    "    df = fix_anomalous_columns(df)\n",
    "    df = fill_missing(df,allcols,textcols,continuouscols,timecols,collist)\n",
    "    # create new column combining date + time (needed for resampling) and make it the index\n",
    "    df['Report Date Time'] = df.apply(lambda x: replace_time(x['Report Date'], x['Time']), axis=1)\n",
    "    df.index = df['Report Date Time']\n",
    "    # return the updated dataframe along with the column category lists\n",
    "    return(df,allcols,textcols,continuouscols,timecols,collist)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clean up selected columns\n",
    "Some values in the input dataset were entered \"free form\" when they should have been constricted to a pick list. Columns with this problem include:\n",
    "\n",
    "- Route\n",
    "- Vehicle\n",
    "- Direction\n",
    "- Location\n",
    "\n",
    "\n",
    "Each of these have a finite set of valid values. We have to fix the data in these columns where multiple tokens have been used to signify the same real-world entity (e.g. \"roncesvalles yard.\" and \"roncesvalles carhouse\", or where incorrect values have been entered (e.g. Direction that does not correspond with a compass point)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clean up Route"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_route (x):\n",
    "    if x in valid_routes:\n",
    "        return(x)\n",
    "    else:\n",
    "        return(\"bad route\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def route_cleanup(df):\n",
    "    print(\"Route count pre cleanup\",df['Route'].nunique())\n",
    "    # df['Route'].value_counts()\n",
    "    # replace bad route with common token\n",
    "    df['Route'] = df['Route'].apply(lambda x:check_route(x))\n",
    "    print(\"route count post cleanup\",df['Route'].nunique())\n",
    "    return(df)    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clean up Vehicle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_vehicle (x):\n",
    "    if str.isdigit(x):\n",
    "        if int(x) in valid_vehicles:\n",
    "            return x\n",
    "        else:\n",
    "            return(\"bad vehicle\")\n",
    "    else:\n",
    "        return(\"bad vehicle\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def vehicle_cleanup(df):\n",
    "    print(\"Vehicle count pre cleanup\",df['Vehicle'].nunique())\n",
    "    df['Vehicle'] = df['Vehicle'].apply(lambda x:check_vehicle(x))\n",
    "    print(\"Vehicle count post cleanup\",df['Vehicle'].nunique())\n",
    "    return(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clean up Direction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_direction (x):\n",
    "    if x in valid_directions:\n",
    "        return(x)\n",
    "    else:\n",
    "        return(\"bad direction\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def direction_cleanup(df):\n",
    "    print(\"Direction count pre cleanup\",df['Direction'].nunique())\n",
    "    df['Direction'] = df['Direction'].str.lower()\n",
    "    df['Direction'] = df['Direction'].str.replace('/','')\n",
    "    df['Direction'] = df['Direction'].replace({'eastbound':'e','westbound':'w','southbound':'s','northbound':'n'})\n",
    "    df['Direction'] = df['Direction'].replace('b','',regex=True)\n",
    "    df['Direction'] = df['Direction'].apply(lambda x:check_direction(x))\n",
    "    print(\"Direction count post cleanup\",df['Direction'].nunique())\n",
    "    return(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clean up Location"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_conjunction(intersection):\n",
    "    intersection = re.sub(\" *& *\",\" and \",intersection)\n",
    "    intersection = re.sub(\" */ *\",\" and \",intersection)\n",
    "    return(intersection)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def order_location(intersection):\n",
    "    # for any string with the format \"* and *\" if the value before the and is alphabetically\n",
    "    # higher than the value after the and, swap the values\n",
    "    conj = \" and \"\n",
    "    alpha_ordered_intersection = intersection\n",
    "    if conj in intersection:\n",
    "        end_first_street = intersection.find(conj)\n",
    "        if (end_first_street > 0) and (len(intersection) > (end_first_street + len(conj))):\n",
    "            start_second_street = intersection.find(conj) + len(conj)\n",
    "            first_street = intersection[0:end_first_street]\n",
    "            second_street = intersection[start_second_street:]\n",
    "            alpha_ordered_intersection = min(first_street,second_street)+conj+max(first_street,second_street)\n",
    "    return(alpha_ordered_intersection)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def location_cleanup(df):\n",
    "    print(\"Location count pre cleanup\",df['Location'].nunique())\n",
    "    # make all location values lower case\n",
    "    df['Location'] = df['Location'].str.lower()\n",
    "    # make substitutions to eliminate obvious duplicate tokens\n",
    "    df['Location'] = df['Location'].replace({'broadviewstation':'broadview station',' at ':' and ',' stn':' station',' ave.':'','/':' and ','roncy':'roncesvalles','carhouse':'yard','yard.':'yard','st. clair':'st clair','ronc. ':'roncesvalles ','long branch':'longbranch','garage':'yard','barns':'yard',' & ':' and '}, regex=True)\n",
    "    # put intersection values into consistent order\n",
    "    df['Location'] = df['Location'].apply(lambda x:order_location(x))\n",
    "    print(\"Location count post cleanup\",df['Location'].nunique())\n",
    "    return(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Remove bad rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# remove rows with bad values\n",
    "def remove_bad(df):\n",
    "    df = df[df.Vehicle != 'bad vehicle']\n",
    "    df = df[df.Direction != 'bad direction']\n",
    "    df = df[df.Route != 'bad route']\n",
    "    return(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Master cell\n",
    "This cell contains calls to the other functions in this notebook to complete the data preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# master cell to call the other functions\n",
    "# get the path for data files\n",
    "path = get_path()\n",
    "print(\"path is \",path)\n",
    "# load route direction and delay data datframes\n",
    "df = ingest_data(path)\n",
    "allcols,textcols,continuouscols,timecols,collist = define_feature_categories(df)\n",
    "# iterate through columns to get basic information\n",
    "for col in list(df):\n",
    "    print(\"Missing values in \",col,\" \",str(df[col].isna().sum()))\n",
    "    print(\"Distinct values \",str(df[col].nunique()))\n",
    "df = fill_missing(df,allcols,textcols,continuouscols,timecols,collist)\n",
    "df.head()\n",
    "'''\n",
    "print(\"number of records: \",len(df.index))\n",
    "print(\"df.info() output\",df.info())\n",
    "print(\"df.shape output\",df.shape)\n",
    "print(\"df.describe() output\",df.describe())\n",
    "print(\"df.types output\",df.dtypes)\n",
    "df,allcols,textcols,continuouscols,timecols,collist = general_cleanup(df)\n",
    "df.head()\n",
    "# get record count by year\n",
    "from collections import Counter\n",
    "df_year = pd.DatetimeIndex(df['Report Date Time']).year\n",
    "print(\"record count by year pre processing: \", str(Counter(df_year)))\n",
    "# check that the values for April 2019 are correct\n",
    "df[df['Report Date Time'].astype(str).str[:7]=='2019-04']\n",
    "# cleanup Route\n",
    "logging.debug(\"df.shape output pre route cleanup\",df.shape)\n",
    "df = route_cleanup(df) \n",
    "df = vehicle_cleanup(df)\n",
    "df = direction_cleanup(df)\n",
    "df = location_cleanup(df)\n",
    "logging.debug(\"df.shape output post location\",df.shape)\n",
    "print(\"Bad route count pre:\",df[df.Route == 'bad route'].shape[0])\n",
    "print(\"Bad direction count pre:\",df[df.Direction == 'bad direction'].shape[0])\n",
    "print(\"Bad vehicle count pre:\",df[df.Vehicle == 'bad vehicle'].shape[0])\n",
    "if remove_bad_values:\n",
    "    df = remove_bad(df)\n",
    "print(\"Bad route count:\",df[df.Route == 'bad route'].shape[0])\n",
    "print(\"Bad direction count:\",df[df.Direction == 'bad direction'].shape[0])\n",
    "print(\"Bad vehicle count:\",df[df.Vehicle == 'bad vehicle'].shape[0])\n",
    "# pickle the cleansed dataframe\n",
    "print(\"df.shape output post removal of bad records \",df.shape)\n",
    "'''\n",
    "if save_transformed_dataframe:\n",
    "    print(\"path is \",path)\n",
    "    file_name = os.path.join(path,pickled_output_dataframe)\n",
    "    print(\"file_name is \",file_name)\n",
    "    df.to_pickle(file_name)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
